# Script to perform market share regressions

####### INITIALIZE

### Load packages
require(data.table)
require(bit64)
require(dplyr)
require(ggplot2)
require(stringr)

### Set directories
# Root of the TAQ analysis tree; the locations below are all derived from it.
home.dir <- "path-to-data-appendix/3_Analysis_of_TAQ_data"

# Input: market share files read by this script.
data.dir <- file.path(home.dir, "WRDS_Server", "Output")

# Output: saved regression results (.Rdata) and generated .tex fragments.
temp.dir <- file.path(home.dir, "Final_Output", "Tables")
fig.dir  <- file.path(home.dir, "Final_Output", "Figures")

######______________________________________FUNCTIONS______________________________________

# NOTE: Exchange code 'Q' represents "NASDAQ (Tape C)". We have aggregated all the NASDAQ observations into exchange code "T" in the SAS files.

# Map a single-letter TAQ exchange code to the exchange's full name.
#
# Args:
#   x: an exchange code (after vectorisation below: a vector of codes).
# Returns:
#   The exchange name, or the string "NA" when the code is unknown or missing.
#   A missing code used to abort with "missing value where TRUE/FALSE needed";
#   it now falls back to the same "NA" label as any other unknown code.
#
# NOTE(review): "BATS BYZ Exchange, Inc." looks like a typo for "BYX", but the
# string is matched verbatim by the top-8 list later in this script, so it is
# kept exactly as is.
getEXname <- function(x){
  exchange_names <- c(
    A = "NYSE MKT LLC",
    B = "NASDAQ OMX BX, Inc.",
    N = "New York Stock Exchange LLC",
    P = "NYSE Arca, Inc.",
    C = "National Stock Exchange, Inc.",
    T = "The Nasdaq Stock Market LLC (Tape A and B)",
    Q = "NASDAQ Tape C",
    D = "FINRA",
    X = "NASDAQ OMX PSX LLC",
    M = "Chicago Stock Exchange, Inc.",
    Z = "BATS BZX Exchange, Inc.",
    Y = "BATS BYZ Exchange, Inc.",
    K = "BATS EDGX Exchange, Inc.",
    J = "BATS EDGA Exchange, Inc.",
    W = "CBOE Stock Exchange LLC",
    V = "The Investors' Exchange",
    I = "International Securities Exchange LLC"
  )

  # match() yields NA for unknown and for missing codes alike
  idx <- match(x, names(exchange_names))
  if (is.na(idx)) {
    return("NA")
  }
  exchange_names[[idx]]
}
# Keep the vectorised wrapper so the return shape (one name per input code,
# named by the code for character input) is unchanged for existing callers.
getEXname <- Vectorize(getEXname)

# Write function to calculate R2
# Compute the R2 from explaining `lhs` with exchange-interacted group effects.
#
# Args:
#   data:   data.table or data.frame containing `lhs` and every column in
#           `vars` (plus "EX" when method = "lm").
#   lhs:    name of the dependent-variable column.
#   vars:   character vector (or list, it is unlisted) of grouping columns.
#   method: "means" (default) computes the residual sum of squares as the
#           within-cell sum of squares over the cells defined jointly by
#           `vars`; "lm" fits a linear model with EX and EX:<var> terms.
# Returns:
#   A numeric scalar, 1 - SS_res / SS_tot, computed on the rows where `lhs`
#   and all `vars` are non-missing.
#
# Fixes relative to the earlier version:
#   * method = "lm" used `valid.rows` before it was defined; the validity mask
#     and the total sum of squares are now computed once for both methods.
#   * an unknown `method` now fails immediately instead of reaching an
#     undefined `r2`.
#   * the EX interaction is built by exact comparison, so a variable whose name
#     merely starts with "EX" is no longer mangled.
#   * columns are read with [[ ]], so a column called "var" or "lhs" cannot
#     shadow the local variables.
getR2 <- function(data, lhs, vars, method = "means") {

  method <- match.arg(method, c("means", "lm"))

  # Unlist vars
  vars <- unlist(vars)

  needed <- unique(c(lhs, vars, if (method == "lm") "EX"))
  absent <- setdiff(needed, names(data))
  if (length(absent) > 0) {
    stop("getR2: column(s) not found in data: ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  # Identify rows with no missing data
  # NOTE: These are the rows that would be included in a hypothetical regression
  valid.rows <- rep(TRUE, nrow(data))
  for (col in needed) {
    valid.rows <- valid.rows & !is.na(data[[col]])
  }
  valid.data <- data[valid.rows, ]

  # Total sum of squares around the overall mean (shared by both methods)
  y      <- valid.data[[lhs]]
  ss_tot <- sum((y - mean(y))^2)

  if (method == "lm") {

    # Interact every variable with the exchange indicator. The only regressors
    # are EX itself and the EX interactions (EX, EX:Listed_Market, ...); the
    # other variables never enter on their own. With EX included, suppressing
    # the intercept would not change the fitted values, so it is left in.
    terms <- ifelse(vars == "EX", "EX", paste("EX", vars, sep = ":"))

    # Regress and obtain the residual sum of squares
    model_formula <- as.formula(paste(lhs, paste(terms, collapse = " + "), sep = " ~ "))
    fit           <- lm(model_formula, data = valid.data)
    ss_res        <- sum(residuals(fit)^2)

  } else {

    # Residual sum of squares: squared deviations from the cell mean, summed
    # over every cell of the cross-classification defined by `vars`.
    cell_formula <- as.formula(paste(lhs, paste(vars, collapse = " + "), sep = " ~ "))
    cell_ss      <- aggregate(cell_formula, data = valid.data,
                              FUN = function(x) sum((x - mean(x))^2))
    ss_res       <- sum(cell_ss[[lhs]], na.rm = TRUE)
  }

  # Calculate R2
  1 - ss_res / ss_tot
}



# Compute R2 for every (lhs, rhs specification) pair, once on the full sample
# and once on its "Top 100" subsample.
#
# Args:
#   data:    data.table with the 0/1 flag columns "f_<sample>" and
#            "f_<sample>_T100" (e.g. "f_Sample_1", "f_Sample_1_T100").
#   period:  label stored in the `period` column of the output.
#   sample:  sample label such as "Sample 1"; also determines the flag columns.
#   all_lhs: character vector of dependent-variable names.
#   all_rhs: list of character vectors, one per specification.
# Returns:
#   A data.frame with columns sample, cut ("All" / "Top 100"), period, lhs,
#   char (list column holding the specification) and r2; the "All" rows come
#   first, then the "Top 100" rows.
#
# Each subsample is now extracted once instead of once per specification, and a
# missing flag column raises an error instead of silently yielding no rows.
R2_fn <- function(data, period, sample, all_lhs, all_rhs) {

  # Replaces "Sample 1" with "f_Sample_1"
  flag_all <- paste("f", str_replace(sample, "[[:punct:]\\s]+", "_"), sep = "_")
  flag_by_cut <- c("All" = flag_all,
                   "Top 100" = paste(flag_all, "T100", sep = "_"))

  run_cut <- function(cut_label) {
    flag_col <- flag_by_cut[[cut_label]]
    if (!(flag_col %in% names(data))) {
      stop("R2_fn: sample flag column '", flag_col, "' not found in data",
           call. = FALSE)
    }

    # Rows flagged as belonging to this cut (missing flags are excluded)
    cut_data <- data[which(data[[flag_col]] == 1), ]

    # One row per (lhs, specification); `char` is a list column, e.g.
    #   sample   cut period lhs       char
    #   Sample 1 All 2015   Sh_S_top8 EX
    #   Sample 1 All 2015   Sh_S_top8 EX, f_NYSE_Listed
    grid <- expand.grid(sample = sample, cut = cut_label, period = period,
                        lhs = all_lhs, char = all_rhs,
                        stringsAsFactors = FALSE)

    n_specs <- nrow(grid)
    grid$r2 <- vapply(seq_len(n_specs), function(i) {
      print(paste(Sys.time(), "Processing", i, "of", n_specs))
      getR2(cut_data, grid$lhs[i], grid$char[[i]], method = "means")
    }, numeric(1))

    grid
  }

  do.call(rbind, lapply(names(flag_by_cut), run_cut))
}

######______________________________________s_ijt : symbol-date-exchange regressions______________________________________
period <- "2015"

## Load data
# Pipe-delimited market share file for the chosen period.
df <- read.table(file.path(data.dir, paste0("Market_Shares_", period, ".txt")), sep = "|",
                 stringsAsFactors = FALSE, header = TRUE)

# Convert to data table (by reference, no copy)
setDT(df)

# Remove FINRA, IEX, ISE. Rows with a missing exchange code are dropped as
# well, which is what the previous one-code-at-a-time filters did implicitly.
df <- df[!is.na(EX) & !(EX %in% c("D", "V", "I"))]

# Clean data
df[, SYM_SUFFIX := ifelse(is.na(SYM_SUFFIX), "", SYM_SUFFIX)]
df[, DATE := as.Date(as.character(DATE), format = "%Y%m%d")]
df[, ex := EX]

# Get the exchange name. getEXname is already vectorised, so look up each
# distinct code once and map the result back onto the rows.
ex_codes <- unique(df$EX)
ex_names <- unname(getEXname(ex_codes))
df[, ExchangeName := ex_names[match(EX, ex_codes)]]

# Convert industry codes to character so they are treated as categories
for (code_col in c("c_NAICS_Industry", "c_NAICS_Sector", "c_SICCD")) {
  set(df, j = code_col, value = as.character(df[[code_col]]))
}


###### Top 8

top8 <- c("The Nasdaq Stock Market LLC (Tape A and B)", "New York Stock Exchange LLC",
          "NYSE Arca, Inc.", "BATS BZX Exchange, Inc.", "BATS EDGX Exchange, Inc.",
          "NASDAQ OMX BX, Inc.", "BATS EDGA Exchange, Inc.", "BATS BYZ Exchange, Inc.")

# Keep only the top 8 exchanges (row subsetting returns a new table, so the
# columns added below do not touch df)
top8_df <- df[ExchangeName %in% top8]

# Total top-8 volume per symbol root and date, and each exchange's share of it.
# NOTE(review): the denominator is grouped by SYM_ROOT and DATE only, so
# securities sharing a root but differing in SYM_SUFFIX are pooled. This is
# kept as in the original computation; confirm it is intended.
top8_df[, Vol_S_top8 := sum(Vol_S_Standard), by = .(SYM_ROOT, DATE)]
top8_df[, Sh_S_top8 := Vol_S_Standard / Vol_S_top8]

# Setting up regression vars

all.lhs  <- c("Sh_S_top8")
all.char <- list(c("EX"),
                 c("EX", "f_NYSE_Listed"),
                 c("EX", "f_NYSE_Listed", "c_Market_Cap_Bucket", "c_Price_Standard",
                   "c_Share_Code", "c_NAICS_Industry"),
                 c("EX", "SYM_ROOT", "SYM_SUFFIX"))

# regressions

top8_sample1 <- R2_fn(top8_df, period, "Sample 1", all.lhs, all.char)
save(top8_sample1, file = file.path(temp.dir, paste0("sample_1_top8_", period, ".Rdata")))


# __________________________________________________________Print to LATEX__________________________________________________________

result_lists <- list(top8_sample1)

# Translate raw regressor names into the LaTeX labels used in the tables.
#
# Args:
#   text: character vector of variable names.
# Returns:
#   `text` with every known variable name replaced by its label. The
#   replacements are applied in the listed order and match substrings.
cleanText <- function(text){
  labels <- c(
    "EX"                  = "Exchange_{j}",
    "f_NYSE_Listed"       = "Exchange_{j} \\\\times \\\\textit{ListedNYSE}_{c}",
    "LISTED_MARKET"       = "Exchange_{j} \\\\times ListingExchange_{c}",
    "c_Market_Cap_Bucket" = "Exchange_{j} \\\\times MarketCapBucket_{c}",
    "c_Price_Standard"    = "Exchange_{j} \\\\times PriceBucket_{c}",
    "c_NAICS_Sector"      = "Exchange_{j} \\\\times SectorNAICS_{c}",
    "c_NAICS_Industry"    = "Exchange_{j} \\\\times \\\\textit{IndustryNAICS}_{c}",
    "c_SICCD"             = "Exchange_{j} \\\\times IndustrySIC_{c}",
    "c_Share_Code"        = "Exchange_{j} \\\\times ShareCode_{c}",
    "SYM_ROOT"            = "Exchange_{j} \\\\times Symbol_{i}"
  )
  for (pattern in names(labels)) {
    # the doubled backslashes are gsub's escape for one literal backslash
    text <- gsub(pattern, labels[[pattern]], text)
  }
  text
}

# Build the LaTeX table row(s) for one specification.
#
# Args:
#   terms:  character vector of cleaned regressor labels (any length >= 1).
#   r2:     the R2 to print in the second column.
#   shaded: TRUE to prefix every row with \rowcolor{Gray}.
# Returns:
#   A single string. One or two terms fit on one row; from the third term on,
#   each term goes on its own continuation row and the error term closes the
#   last one. Zero terms give NA.
texRow <- function(terms, r2, shaded = FALSE) {
  n_terms <- length(terms)
  if (n_terms == 0L) {
    return(NA_character_)
  }

  lead <- if (shaded) "\\rowcolor{Gray} " else " "
  if (n_terms <= 2L) {
    rhs <- paste(c(terms, "\\varepsilon_{ijt}"), collapse = " + ")
    return(sprintf("%s$ s_{ijt} = %s $ & %0.3f \\\\", lead, rhs, r2))
  }

  cont <- if (shaded) {
    "  \\rowcolor{Gray}    \\hspace{0.920in}   "
  } else {
    "  \\hspace{0.920in}   "
  }
  first  <- sprintf("%s$ s_{ijt} = %s + %s $ & %0.3f \\\\", lead, terms[1], terms[2], r2)
  # terms 3 .. n-1 (none when there are exactly three terms)
  middle <- sprintf("%s$ + %s $ &  \\\\", cont, terms[seq_len(n_terms - 3L) + 2L])
  last   <- sprintf("%s$ + %s + \\varepsilon_{ijt} $ &  \\\\", cont, terms[n_terms])
  paste(c(first, middle, last), collapse = "\n")
}

# Write the LaTeX rows for a set of results to a file in fig.dir.
#
# Args:
#   subset:   data.frame with columns r2, char (list column) and
#             char.text.1 .. char.text.k holding the cleaned labels.
#   filename: name of the output file, created inside fig.dir.
# Returns:
#   Invisibly, the character vector of rows that was written.
# Rows are sorted by increasing R2 and every second row is shaded. The file is
# written with cat(file = ) so no sink is left open if something fails, and an
# empty subset produces an empty file instead of an error.
texResults <- function(subset, filename) {
  ordered <- subset[order(subset$r2), ]

  tex <- vapply(seq_len(nrow(ordered)), function(i) {
    n_terms   <- length(unlist(ordered$char[i]))
    term_cols <- paste0("char.text.", seq_len(n_terms))
    absent    <- setdiff(term_cols, names(ordered))
    if (length(absent) > 0) {
      stop("texResults: missing label column(s): ",
           paste(absent, collapse = ", "), call. = FALSE)
    }
    terms <- vapply(term_cols, function(col) ordered[[col]][i], character(1))
    texRow(terms, ordered$r2[i], shaded = (i %% 2 == 0))
  }, character(1))

  cat(tex, file = file.path(fig.dir, filename))
  invisible(tex)
}

for (results in result_lists) {

  # "Sample 1" -> "Sample1" (works for any sample label, not only "Sample 1")
  sample <- gsub("[[:space:]]+", "", results$sample[1])

  # Period recorded with the results; used in the output file names
  result_period <- results$period[1]

  to.keep <- c("EX", "EX; f_NYSE_Listed", "EX; SYM_ROOT; SYM_SUFFIX",
               "EX; f_NYSE_Listed; c_Market_Cap_Bucket; c_Price_Standard; c_Share_Code; c_NAICS_Industry")

  # Keep only the specifications that are reported
  spec_labels  <- vapply(results$char,
                         function(v) paste(unlist(v), collapse = "; "),
                         character(1))
  results$keep <- spec_labels %in% to.keep
  results      <- results[results$keep, ]

  # Symbol-level specifications are displayed as Exchange x Symbol only
  has_suffix <- vapply(results$char,
                       function(v) "SYM_SUFFIX" %in% unlist(v),
                       logical(1))
  if (any(has_suffix)) {
    results$char[has_suffix] <- list(c("EX", "SYM_ROOT"))
  }

  # Clean variable names: one char.text.<j> column per term position, "NA"
  # where a specification has fewer than j terms. At least six columns are
  # created, more if a specification is longer.
  n_terms <- lengths(results$char)
  for (j in seq_len(max(6L, n_terms))) {
    text_col <- paste0("char.text.", j)
    results[[text_col]] <- rep("NA", nrow(results))
    has_term <- n_terms >= j
    results[[text_col]][has_term] <- vapply(results$char[has_term],
                                            function(v) cleanText(unlist(v)[j]),
                                            character(1))
  }

  top8_name <- paste("top8_xsection_reg", result_period, sample, sep = "_")

  texResults(results[results$cut == "All", ], paste(top8_name, "all.tex", sep = "_"))
  texResults(results[results$cut == "Top 100", ], paste(top8_name, "t100.tex", sep = "_"))
}